# Swap every underscore in `x` for a space; applied to column names before
# tables are rendered.
underscore_to_space <- function(x) {
  str_replace_all(x, pattern = "_", replacement = " ")
}
# Load the outlier calls and add `high_level_cohort`: any comparison cohort
# whose name contains "Treehouse" is relabelled "Treehouse"; every other
# cohort keeps its own name.
outliers <- mutate(
  read_tsv("../input_data/druggable_outliers_from_treehouse_and_other_cohorts_2023_11_09-13_46_32_2023.tsv"),
  high_level_cohort = ifelse(
    str_detect(comparison_cohort, "Treehouse"),
    "Treehouse",
    comparison_cohort
  )
)
## Rows: 287 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): Sample_ID, comparison_cohort, gene, donor_ID
## lgl (1): pathway_support
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Define cohort codes

# Lookup table mapping each comparison cohort name to a one-letter code.
cohort_codes <- tribble(
  ~cohort_name,   ~cohort_code,
  "PEDAYA",       "P",
  "TCGA",         "T",
  "TH03_TH34",    "S",
  "Treehouse_pc", "C",
  "Treehouse_pd", "D"
)

Tile plot of all outliers

# One facet per sample, laid out in a single row; within a facet a tile marks
# each gene/cohort pair present in `outliers`.
ggplot(data = outliers) +
  geom_tile(
    mapping = aes(x = comparison_cohort, y = gene, fill = comparison_cohort)
  ) +
  facet_wrap(~Sample_ID, nrow = 1) +
  theme(
    # The cohort is already shown by the fill colour, so x tick labels are hidden.
    axis.text.x = element_blank(),
    strip.text.x = element_text(angle = 90)
  ) +
  xlab("") +
  scale_fill_bright()

Heatmap shows the number of cohorts in which each outlier was detected

I can make this look better if we decide to use it, but it’s non-trivial

# Count the rows of `outliers` for each sample/gene pair (presumably one row
# per comparison cohort that called the outlier).
# `.groups = "drop"` returns an ungrouped tibble, so later verbs are not
# silently applied per Sample_ID, and the grouping message is no longer emitted.
outliers_heatmap_data <- outliers %>%
  group_by(Sample_ID, gene) %>%
  summarize(n_outliers = n(), .groups = "drop")
## `summarise()` has grouped output by 'Sample_ID'. You can override using the
## `.groups` argument.
# Sample-by-gene heatmap; tile fill is `n_outliers` for that pair and tiles
# are outlined in black.
ggplot(data = outliers_heatmap_data) +
  geom_tile(
    mapping = aes(x = Sample_ID, y = gene, fill = n_outliers),
    color = "black"
  ) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )

library(ggVennDiagram)
# Build one set of "<Sample_ID>_<gene>" keys per comparison cohort for the
# Venn diagrams.
raw_outliers_for_venn <- outliers %>%
  mutate(sample_gene = paste(Sample_ID, gene, sep = "_")) %>%
  arrange(comparison_cohort) %>%
  select(sample_gene, comparison_cohort) %>%
  group_split(comparison_cohort)

list_of_outliers_for_venn <- lapply(
  raw_outliers_for_venn,
  function(x) pull(x, sample_gene)
)

# Name each set from the cohort label carried inside its own split, rather
# than from a separately sorted vector, so the names cannot fall out of step
# with the group order chosen by group_split().
names(list_of_outliers_for_venn) <- vapply(
  raw_outliers_for_venn,
  function(x) as.character(x$comparison_cohort[[1]]),
  character(1)
)

# Venn diagram of the per-cohort outlier sets, with show_intersect enabled.
ggVennDiagram(
  x = list_of_outliers_for_venn,
  show_intersect = TRUE
)
## Warning in geom_text(aes_string(label = "count", text = "text"), x =
## label_coord[, : Ignoring unknown aesthetics: text
# Same Venn diagram, filled with the "Reds" distiller palette.
ggVennDiagram(x = list_of_outliers_for_venn) +
  scale_fill_distiller(
    palette = "Reds",
    direction = 1
  )

Export list of genes found only by TCGA

# Keep sample/gene pairs that have a single row in `outliers` whose cohort is
# TCGA, then write their gene names to a TSV file.
# NOTE(review): a gene that is TCGA-only in several samples is written once
# per sample; add distinct() if a unique gene list is wanted. Confirm.
outliers %>%
  group_by(Sample_ID, gene) %>%
  filter(n() == 1, "TCGA" %in% comparison_cohort) %>%
  ungroup() %>%
  select(gene) %>%
  write_tsv("../gather_input_data/genes found only by TCGA in at least one sample.txt")

Annotate with combined full low level cohort names

# Join the elements of `x` into a single string separated by ", ".
collapse_fun <- function(x) {
  paste0(x, collapse = ", ")
}

# Gene-by-sample table: each cell lists, comma-separated, the comparison
# cohorts in which that gene was an outlier for that sample.
all_outliers_combined_wide <- outliers %>%
  select(-c(pathway_support, donor_ID, high_level_cohort)) %>%
  pivot_wider(
    names_from = Sample_ID,
    values_from = comparison_cohort,
    values_fn = collapse_fun
  )

# Number of distinct samples in `outliers`.
outliers %>% pull(Sample_ID) %>% n_distinct()
## [1] 34
# Number of distinct genes in `outliers`.
outliers %>% pull(gene) %>% n_distinct()
## [1] 56
# Back to long form: one row per gene/sample pair with its combined cohort
# string; pairs with no outlier call (NA cells) are removed.
all_outliers_combined_long <- all_outliers_combined_wide %>%
  pivot_longer(
    cols = -gene,
    names_to = "Sample_ID",
    values_to = "comparison_cohorts"
  ) %>%
  na.omit()

How many outliers are present in each combination of cohorts?

# Frequency of each cohort combination, most common first, with formatted
# percentages and a totals row.
all_outliers_combined_long %>%
  tabyl(comparison_cohorts) %>%
  arrange(desc(n)) %>%
  adorn_pct_formatting() %>%
  adorn_totals() %>%
  kbl() %>%
  kable_styling(full_width = FALSE)
comparison_cohorts n percent
TCGA, Treehouse_pc 27 20.8%
TCGA 21 16.2%
Treehouse_pd 13 10.0%
TCGA, TH03_TH34, Treehouse_pc 12 9.2%
TH03_TH34 11 8.5%
PEDAYA, TCGA, TH03_TH34, Treehouse_pc 9 6.9%
TCGA, TH03_TH34, Treehouse_pc, Treehouse_pd 8 6.2%
PEDAYA, TCGA, TH03_TH34, Treehouse_pc, Treehouse_pd 7 5.4%
PEDAYA 5 3.8%
TCGA, Treehouse_pc, Treehouse_pd 4 3.1%
TCGA, TH03_TH34 3 2.3%
TCGA, Treehouse_pd 3 2.3%
PEDAYA, TCGA, Treehouse_pc, Treehouse_pd 2 1.5%
PEDAYA, TCGA, TH03_TH34 1 0.8%
PEDAYA, TCGA, Treehouse_pc 1 0.8%
PEDAYA, Treehouse_pc 1 0.8%
TH03_TH34, Treehouse_pc 1 0.8%
TH03_TH34, Treehouse_pd 1 0.8%
Total 130

Tile plot of combination of outliers

# Sample-by-gene tiles coloured by the combination of cohorts that called
# the outlier.
ggplot(data = all_outliers_combined_long) +
  geom_tile(
    mapping = aes(x = Sample_ID, y = gene, fill = comparison_cohorts)
  ) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )

# Number of distinct samples remaining in the long table.
all_outliers_combined_long %>% pull(Sample_ID) %>% n_distinct()
## [1] 34

Annotate with combined full high level cohort names

# Gene-by-sample table of high-level cohorts. distinct() removes the repeated
# rows that arise once the individual Treehouse cohorts share one label.
high_level_all_outliers_combined_wide <- outliers %>%
  select(-c(pathway_support, donor_ID, comparison_cohort)) %>%
  distinct() %>%
  pivot_wider(
    names_from = Sample_ID,
    values_from = high_level_cohort,
    values_fn = collapse_fun
  )

# Number of distinct samples in `outliers`.
outliers %>% pull(Sample_ID) %>% n_distinct()
## [1] 34
# Number of distinct genes in `outliers`.
outliers %>% pull(gene) %>% n_distinct()
## [1] 56
# Long form of the high-level table: one row per gene/sample pair with its
# combined high-level cohort string; NA cells (no outlier call) are removed.
high_level_all_outliers_combined_long <- high_level_all_outliers_combined_wide %>%
  pivot_longer(
    cols = -gene,
    names_to = "Sample_ID",
    values_to = "comparison_cohorts"
  ) %>%
  na.omit()

How many outliers are present in each high level combination of cohorts?

# Frequency of each high-level cohort combination, most common first, with
# formatted percentages and a totals row.
high_level_all_outliers_combined_long %>%
  tabyl(comparison_cohorts) %>%
  arrange(desc(n)) %>%
  adorn_pct_formatting() %>%
  adorn_totals() %>%
  kbl() %>%
  kable_styling(full_width = FALSE)
comparison_cohorts n percent
TCGA, Treehouse 34 26.2%
TCGA 21 16.2%
TCGA, TH03_TH34, Treehouse 20 15.4%
PEDAYA, TCGA, TH03_TH34, Treehouse 16 12.3%
Treehouse 13 10.0%
TH03_TH34 11 8.5%
PEDAYA 5 3.8%
PEDAYA, TCGA, Treehouse 3 2.3%
TCGA, TH03_TH34 3 2.3%
TH03_TH34, Treehouse 2 1.5%
PEDAYA, TCGA, TH03_TH34 1 0.8%
PEDAYA, Treehouse 1 0.8%
Total 130

Annotate with minimal combined cohort abbreviations

# Concatenate the elements of `x` into one string with no separator.
collapse_fun_no_coma <- function(x) {
  paste0(x, collapse = "")
}

# Wide gene-by-sample table of one-letter cohort codes; a code is followed by
# an asterisk when that call has pathway support. The asterisk is written as
# "\\*" so it is not interpreted as italics in the kbl table.
# `high_level_cohort` is not dropped here, so it remains an id column and each
# gene gets one row per high-level cohort.
# NOTE(review): confirm that per-high-level-cohort rows are intended; drop
# `high_level_cohort` before pivoting if one row per gene is wanted.
all_outliers_min_abbrev_combined_wide <- outliers %>%
  left_join(cohort_codes, by = c("comparison_cohort" = "cohort_name")) %>%
  mutate(
    cohort_code_pathway = ifelse(
      pathway_support,
      paste0(cohort_code, "\\*"),
      cohort_code
    )
  ) %>%
  select(-c(pathway_support, donor_ID, comparison_cohort, cohort_code)) %>%
  pivot_wider(
    names_from = Sample_ID,
    values_from = cohort_code_pathway,
    values_fn = collapse_fun_no_coma,
    values_fill = ""
  )


# Render the minimal-abbreviation table sorted by gene, with underscores in
# the column names shown as spaces.
all_outliers_min_abbrev_combined_wide %>%
  arrange(gene) %>%
  rename_all(underscore_to_space) %>%
  kbl() %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = "bordered"
  )
gene high level cohort TH34 1162 S01 TH34 1149 S02 TH34 1238 S01 TH34 1349 S01 TH34 1349 S02 TH34 1379 S01 TH34 1380 S01 TH34 1150 S02 TH34 1399 S01 TH34 1400 S01 TH34 1412 S01 TH34 1414 S01 TH34 1415 S01 TH34 1444 S01 TH34 1452 S01 TH34 2292 S01 TH34 2351 S01 TH34 2411 S01 TH34 2666 S01 TH34 1163 S01 TH34 1179 S01 TH34 1239 S01 TH34 1350 S01 TH34 1351 S01 TH34 1352 S01 TH34 1381 S01 TH34 1445 S02 TH34 1446 S01 TH34 1447 S01 TH34 1447 S02 TH34 1455 S01 TH34 1456 S02 TH34 2293 S01 TH34 2410 S01
AKT1 TH03_TH34 S* S*
AKT1 Treehouse D*
AKT2 PEDAYA P*
AKT2 TCGA T*
AKT2 TH03_TH34 S*
AKT2 Treehouse C*D*
ALK PEDAYA P
ALK TCGA T
ALK TH03_TH34 S
ALK Treehouse C
BCL6 TCGA T*
BCL6 Treehouse D
BTK TCGA T* T* T*
BTK TH03_TH34 S*
BTK Treehouse C* C* C*
CCND1 TCGA T*
CCND1 TH03_TH34 S*
CCND2 Treehouse D* D
CCND3 TCGA T* T*
CCNE1 Treehouse D*
CDK4 PEDAYA P*
CDK4 TCGA T* T*
CDK4 TH03_TH34 S* S*
CDK4 Treehouse C* C*D*
CDK9 TCGA T* T*
CDK9 TH03_TH34 S
CDK9 Treehouse D C
CSF1R Treehouse D*
DEPTOR TH03_TH34 S*
ETV1 TCGA T T* T T
ETV1 Treehouse C* C*
FGFR1 TCGA T* T* T*
FGFR1 Treehouse C* C*
FGFR2 TCGA T*
FGFR3 PEDAYA P
FGFR3 TCGA T* T
FGFR3 TH03_TH34 S*
FGFR4 PEDAYA P P P* P P
FGFR4 Treehouse C
FLT4 PEDAYA P*
FLT4 TCGA T T* T
FLT4 TH03_TH34 S S*
FLT4 Treehouse CD C* C
GATA2 TCGA T T*
GATA2 Treehouse CD*
HDAC4 PEDAYA P
HDAC4 TCGA T*
HDAC4 TH03_TH34 S*
HDAC4 Treehouse CD*
HDAC7 Treehouse D*
HMOX1 PEDAYA P P*
HMOX1 TCGA T T* T* T* T* T*
HMOX1 Treehouse CD* C*D* C*D C* D* D* C
HSP90B1 TCGA T*
HSP90B1 TH03_TH34 S*
HSP90B1 Treehouse C*D*
IGF1 PEDAYA P* P* P*
IGF1 TCGA T* T*
IGF1 TH03_TH34 S*
IGF1 Treehouse C*D C*
IGF2 TCGA T* T* T* T T* T* T* T* T* T T* T* T* T* T T T* T*
IGF2 Treehouse C* D* C* C* C* C C* C* C C* D* C C C*
IL6 PEDAYA P*
IL6 TCGA T*
IL6 TH03_TH34 S*
IL6 Treehouse C*
JAK1 PEDAYA P
JAK1 TCGA T T
JAK1 TH03_TH34 S S
JAK1 Treehouse C C
KDR TCGA T* T*
KDR TH03_TH34 S*
KDR Treehouse C*
KIT PEDAYA P P P*
KIT TCGA T T T*
KIT TH03_TH34 S S S*
KIT Treehouse C C C*D*
MAP2K2 TCGA T*
MAP2K2 TH03_TH34 S* S*
MAP2K2 Treehouse C*D*
MAP2K4 TCGA T
MAP2K4 TH03_TH34 S
MAP2K4 Treehouse CD*
MDM2 PEDAYA P* P
MDM2 TCGA T* T
MDM2 TH03_TH34 S* S
MDM2 Treehouse C*D* C
MS4A1 PEDAYA P
MS4A1 TCGA T*
MS4A1 TH03_TH34 S*
MS4A1 Treehouse C*
MTOR TCGA T*
MTOR TH03_TH34 S*
NOTCH3 TCGA T*
NOTCH3 TH03_TH34 S*
NOTCH3 Treehouse C*D*
NTRK2 TH03_TH34 S* S* S* S* S* S* S
NTRK2 Treehouse C
NTRK3 TCGA T T* T T
NTRK3 TH03_TH34 S*
NTRK3 Treehouse CD C* C C
PARP1 Treehouse D
PARP2 TCGA T* T*
PARP2 Treehouse C*D*
PDCD1 PEDAYA P
PDCD1 TCGA T
PDCD1 TH03_TH34 S
PDCD1 Treehouse C
PDGFRA TCGA T*
PIK3CD TCGA T* T* T*
PIK3CD TH03_TH34 S* S*
PIK3CD Treehouse C* C*
PIK3R1 TH03_TH34 S*
PIK3R2 TCGA T*
PIK3R2 TH03_TH34 S*
PIK3R2 Treehouse D* C*
PIK3R5 TCGA T* T*
PIK3R5 Treehouse C* C*
PTCH1 TCGA T* T* T*
PTCH1 Treehouse C* C*
RAF1 Treehouse D*
RPTOR TCGA T*
RPTOR TH03_TH34 S*
STAT1 TH03_TH34 S
STAT2 TCGA T
STAT2 TH03_TH34 S
STAT2 Treehouse CD
STAT5A Treehouse D*
TSC2 TCGA T* T* T*
TSC2 TH03_TH34 S* S*
TSC2 Treehouse C*D* C*
VEGFA TCGA T* T* T*
VEGFA TH03_TH34 S* S*
VEGFA Treehouse C* C* D*
VEGFC PEDAYA P*
VEGFC TCGA T*
VEGFC TH03_TH34 S*
VEGFC Treehouse C*D*
WEE1 TCGA T*
WEE1 TH03_TH34 S
WEE1 Treehouse C*D*

Annotate with combined cohort abbreviations

# Wide gene-by-sample table of comma-separated one-letter cohort codes.
# `high_level_cohort` is not dropped here, so it remains an id column and each
# gene gets one row per high-level cohort.
# NOTE(review): confirm that per-high-level-cohort rows are intended; drop
# `high_level_cohort` before pivoting if one row per gene is wanted.
all_outliers_abbrev_combined_wide <- outliers %>%
  left_join(cohort_codes, by = c("comparison_cohort" = "cohort_name")) %>%
  select(-c(pathway_support, donor_ID, comparison_cohort)) %>%
  pivot_wider(
    names_from = Sample_ID,
    values_from = cohort_code,
    values_fn = collapse_fun,
    values_fill = ""
  )
# Render the abbreviation table sorted by gene, with underscores in the
# column names shown as spaces.
all_outliers_abbrev_combined_wide %>%
  arrange(gene) %>%
  rename_all(underscore_to_space) %>%
  kbl() %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = "bordered"
  )
gene high level cohort TH34 1162 S01 TH34 1149 S02 TH34 1238 S01 TH34 1349 S01 TH34 1349 S02 TH34 1379 S01 TH34 1380 S01 TH34 1150 S02 TH34 1399 S01 TH34 1400 S01 TH34 1412 S01 TH34 1414 S01 TH34 1415 S01 TH34 1444 S01 TH34 1452 S01 TH34 2292 S01 TH34 2351 S01 TH34 2411 S01 TH34 2666 S01 TH34 1163 S01 TH34 1179 S01 TH34 1239 S01 TH34 1350 S01 TH34 1351 S01 TH34 1352 S01 TH34 1381 S01 TH34 1445 S02 TH34 1446 S01 TH34 1447 S01 TH34 1447 S02 TH34 1455 S01 TH34 1456 S02 TH34 2293 S01 TH34 2410 S01
AKT1 TH03_TH34 S S
AKT1 Treehouse D
AKT2 PEDAYA P
AKT2 TCGA T
AKT2 TH03_TH34 S
AKT2 Treehouse C, D
ALK PEDAYA P
ALK TCGA T
ALK TH03_TH34 S
ALK Treehouse C
BCL6 TCGA T
BCL6 Treehouse D
BTK TCGA T T T
BTK TH03_TH34 S
BTK Treehouse C C C
CCND1 TCGA T
CCND1 TH03_TH34 S
CCND2 Treehouse D D
CCND3 TCGA T T
CCNE1 Treehouse D
CDK4 PEDAYA P
CDK4 TCGA T T
CDK4 TH03_TH34 S S
CDK4 Treehouse C C, D
CDK9 TCGA T T
CDK9 TH03_TH34 S
CDK9 Treehouse D C
CSF1R Treehouse D
DEPTOR TH03_TH34 S
ETV1 TCGA T T T T
ETV1 Treehouse C C
FGFR1 TCGA T T T
FGFR1 Treehouse C C
FGFR2 TCGA T
FGFR3 PEDAYA P
FGFR3 TCGA T T
FGFR3 TH03_TH34 S
FGFR4 PEDAYA P P P P P
FGFR4 Treehouse C
FLT4 PEDAYA P
FLT4 TCGA T T T
FLT4 TH03_TH34 S S
FLT4 Treehouse C, D C C
GATA2 TCGA T T
GATA2 Treehouse C, D
HDAC4 PEDAYA P
HDAC4 TCGA T
HDAC4 TH03_TH34 S
HDAC4 Treehouse C, D
HDAC7 Treehouse D
HMOX1 PEDAYA P P
HMOX1 TCGA T T T T T T
HMOX1 Treehouse C, D C, D C, D C D D C
HSP90B1 TCGA T
HSP90B1 TH03_TH34 S
HSP90B1 Treehouse C, D
IGF1 PEDAYA P P P
IGF1 TCGA T T
IGF1 TH03_TH34 S
IGF1 Treehouse C, D C
IGF2 TCGA T T T T T T T T T T T T T T T T T T
IGF2 Treehouse C D C C C C C C C C D C C C
IL6 PEDAYA P
IL6 TCGA T
IL6 TH03_TH34 S
IL6 Treehouse C
JAK1 PEDAYA P
JAK1 TCGA T T
JAK1 TH03_TH34 S S
JAK1 Treehouse C C
KDR TCGA T T
KDR TH03_TH34 S
KDR Treehouse C
KIT PEDAYA P P P
KIT TCGA T T T
KIT TH03_TH34 S S S
KIT Treehouse C C C, D
MAP2K2 TCGA T
MAP2K2 TH03_TH34 S S
MAP2K2 Treehouse C, D
MAP2K4 TCGA T
MAP2K4 TH03_TH34 S
MAP2K4 Treehouse C, D
MDM2 PEDAYA P P
MDM2 TCGA T T
MDM2 TH03_TH34 S S
MDM2 Treehouse C, D C
MS4A1 PEDAYA P
MS4A1 TCGA T
MS4A1 TH03_TH34 S
MS4A1 Treehouse C
MTOR TCGA T
MTOR TH03_TH34 S
NOTCH3 TCGA T
NOTCH3 TH03_TH34 S
NOTCH3 Treehouse C, D
NTRK2 TH03_TH34 S S S S S S S
NTRK2 Treehouse C
NTRK3 TCGA T T T T
NTRK3 TH03_TH34 S
NTRK3 Treehouse C, D C C C
PARP1 Treehouse D
PARP2 TCGA T T
PARP2 Treehouse C, D
PDCD1 PEDAYA P
PDCD1 TCGA T
PDCD1 TH03_TH34 S
PDCD1 Treehouse C
PDGFRA TCGA T
PIK3CD TCGA T T T
PIK3CD TH03_TH34 S S
PIK3CD Treehouse C C
PIK3R1 TH03_TH34 S
PIK3R2 TCGA T
PIK3R2 TH03_TH34 S
PIK3R2 Treehouse D C
PIK3R5 TCGA T T
PIK3R5 Treehouse C C
PTCH1 TCGA T T T
PTCH1 Treehouse C C
RAF1 Treehouse D
RPTOR TCGA T
RPTOR TH03_TH34 S
STAT1 TH03_TH34 S
STAT2 TCGA T
STAT2 TH03_TH34 S
STAT2 Treehouse C, D
STAT5A Treehouse D
TSC2 TCGA T T T
TSC2 TH03_TH34 S S
TSC2 Treehouse C, D C
VEGFA TCGA T T T
VEGFA TH03_TH34 S S
VEGFA Treehouse C C D
VEGFC PEDAYA P
VEGFC TCGA T
VEGFC TH03_TH34 S
VEGFC Treehouse C, D
WEE1 TCGA T
WEE1 TH03_TH34 S
WEE1 Treehouse C, D

Summary table for all outliers and low level cohorts

# Number of distinct sample/gene outliers called in at least one cohort.
n_outliers_detected_by_any_method <- outliers %>%
  distinct(Sample_ID, gene) %>%
  nrow()

# Number of distinct sample/gene outliers that have pathway support in at
# least one cohort.
n_outliers_with_pathway_support_detected_by_any_method <- outliers %>%
  filter(pathway_support) %>%
  distinct(Sample_ID, gene) %>%
  nrow()


# Per comparison cohort: number of outlier rows, number with pathway support,
# the share with pathway support, and the cohort's row count as a percentage
# of all distinct sample/gene outliers found by any cohort.
outlier_summary <- outliers %>%
  group_by(comparison_cohort) %>%
  summarize(
    n_outliers_detected = n(),
    n_outliers_with_pathway_support = sum(pathway_support),
    pct_outliers_with_pathway_support =
      100 * n_outliers_with_pathway_support / n_outliers_detected,
    pct_outliers_detected =
      100 * n_outliers_detected / n_outliers_detected_by_any_method
  )

# Append a " Total" row built from the any-cohort counts. It supplies no
# pct_outliers_detected value, so bind_rows() leaves that cell NA.
outlier_summary_with_totals <- bind_rows(
  outlier_summary,
  tibble(
    comparison_cohort = " Total",
    n_outliers_detected = n_outliers_detected_by_any_method,
    n_outliers_with_pathway_support =
      n_outliers_with_pathway_support_detected_by_any_method,
    pct_outliers_with_pathway_support =
      100 * n_outliers_with_pathway_support_detected_by_any_method /
        n_outliers_detected_by_any_method
  )
)
                 
                 
  
# Render the low-level summary; numeric columns are rounded to whole numbers.
outlier_summary_with_totals %>%
  rename_all(underscore_to_space) %>%
  kbl(
    digits = c(NA, 0, 0, 0, 0)
  ) %>%
  kable_styling(full_width = FALSE)
comparison cohort n outliers detected n outliers with pathway support pct outliers with pathway support pct outliers detected
PEDAYA 26 12 46 20
TCGA 98 74 76 75
TH03_TH34 53 39 74 41
Treehouse_pc 72 47 65 55
Treehouse_pd 38 29 76 29
Total 130 101 78 NA

Summary table for all outliers and high level cohorts

# Number of distinct sample/gene outliers called in at least one cohort.
n_outliers_detected_by_any_method <- outliers %>%
  distinct(Sample_ID, gene) %>%
  nrow()

# Number of distinct sample/gene outliers that have pathway support in at
# least one cohort.
n_outliers_with_pathway_support_detected_by_any_method <- outliers %>%
  filter(pathway_support) %>%
  distinct(Sample_ID, gene) %>%
  nrow()


# First collapse to one row per high-level cohort/sample/gene, counting
# pathway support if any member row has it; then summarise per high-level
# cohort: outlier count, count and share with pathway support, and the count
# as a percentage of all distinct sample/gene outliers.
high_level_outlier_summary <- outliers %>%
  group_by(high_level_cohort, Sample_ID, gene) %>%
  summarize(pathway_support = any(pathway_support)) %>%
  group_by(high_level_cohort) %>%
  summarize(
    n_outliers_detected = n(),
    n_outliers_with_pathway_support = sum(pathway_support),
    pct_outliers_with_pathway_support =
      100 * n_outliers_with_pathway_support / n_outliers_detected,
    pct_outliers_detected =
      100 * n_outliers_detected / n_outliers_detected_by_any_method
  )
## `summarise()` has grouped output by 'high_level_cohort', 'Sample_ID'. You can
## override using the `.groups` argument.
# Sort cohorts in descending name order and append a " Total" row built from
# the any-cohort counts. The total row supplies no pct_outliers_detected
# value, so bind_rows() leaves that cell NA.
high_level_outlier_summary_with_totals <- bind_rows(
  arrange(high_level_outlier_summary, desc(high_level_cohort)),
  tibble(
    high_level_cohort = " Total",
    n_outliers_detected = n_outliers_detected_by_any_method,
    n_outliers_with_pathway_support =
      n_outliers_with_pathway_support_detected_by_any_method,
    pct_outliers_with_pathway_support =
      100 * n_outliers_with_pathway_support_detected_by_any_method /
        n_outliers_detected_by_any_method
  )
)
                 
                 
  
# Render the high-level summary; numbers are rounded to whole numbers and
# use "," as the thousands separator.
high_level_outlier_summary_with_totals %>%
  rename_all(underscore_to_space) %>%
  kbl(
    format.args = list(big.mark = ","),
    digits = c(NA, 0, 0, 0, 0)
  ) %>%
  kable_styling(full_width = FALSE)
high level cohort n outliers detected n outliers with pathway support pct outliers with pathway support pct outliers detected
Treehouse 89 64 72 68
TH03_TH34 53 39 74 41
TCGA 98 74 76 75
PEDAYA 26 12 46 20
Total 130 101 78 NA

Combined high and low level tables

# Combine the high-level and low-level summaries into one table: the
# individual Treehouse cohort rows are inserted directly after the aggregate
# "Treehouse" row, and every other row keeps its order from the high-level
# table. The insertion point is looked up from the data instead of using
# hard-coded row indices, so the table still builds if the number or order of
# cohorts changes. local() keeps the helper objects out of the global
# environment.
high_low <- local({
  high_level_rows <- high_level_outlier_summary_with_totals %>%
    rename(comparison_cohort = high_level_cohort)

  treehouse_rows <- outlier_summary_with_totals %>%
    filter(str_detect(comparison_cohort, "Treehouse"))

  # Row of the aggregate "Treehouse" entry; 0 (insert at the top) if absent.
  treehouse_pos <- match(
    "Treehouse",
    high_level_rows$comparison_cohort,
    nomatch = 0L
  )

  bind_rows(
    filter(high_level_rows, row_number() <= treehouse_pos),
    treehouse_rows,
    filter(high_level_rows, row_number() > treehouse_pos)
  )
})

  

# Render the combined high/low-level summary; numbers are rounded to whole
# numbers and use "," as the thousands separator.
high_low %>%
  rename_all(underscore_to_space) %>%
  kbl(
    format.args = list(big.mark = ","),
    digits = c(NA, 0, 0, 0, 0)
  ) %>%
  kable_styling(full_width = FALSE)
comparison cohort n outliers detected n outliers with pathway support pct outliers with pathway support pct outliers detected
Treehouse 89 64 72 68
Treehouse_pc 72 47 65 55
Treehouse_pd 38 29 76 29
TH03_TH34 53 39 74 41
TCGA 98 74 76 75
PEDAYA 26 12 46 20
Total 130 101 78 NA

REPEAT ANALYSIS USING ONLY OUTLIERS WITH PATHWAY SUPPORT

Tile plot of outliers with pathway support

# Same faceted tile plot as for all outliers, restricted to rows with
# pathway support.
ggplot(data = filter(outliers, pathway_support)) +
  geom_tile(
    mapping = aes(x = comparison_cohort, y = gene, fill = comparison_cohort)
  ) +
  facet_wrap(~Sample_ID, nrow = 1) +
  theme(
    # The cohort is already shown by the fill colour, so x tick labels are hidden.
    axis.text.x = element_blank(),
    strip.text.x = element_text(angle = 90)
  ) +
  xlab("") +
  scale_fill_bright()

Heatmap shows the number of cohorts in which each outlier was detected

I can make this look better if we decide to use it, but it’s non-trivial

# Count the pathway-supported rows of `outliers` for each sample/gene pair.
# `.groups = "drop"` returns an ungrouped tibble, so later verbs are not
# silently applied per Sample_ID, and the grouping message is no longer emitted.
pathway_outliers_heatmap_data <- outliers %>%
  filter(pathway_support) %>%
  group_by(Sample_ID, gene) %>%
  summarize(n_outliers = n(), .groups = "drop")
## `summarise()` has grouped output by 'Sample_ID'. You can override using the
## `.groups` argument.
# Sample-by-gene heatmap for pathway-supported outliers; tile fill is
# `n_outliers` for that pair and tiles are outlined in black.
ggplot(data = pathway_outliers_heatmap_data) +
  geom_tile(
    mapping = aes(x = Sample_ID, y = gene, fill = n_outliers),
    color = "black"
  ) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )

# Build one set of "<Sample_ID>_<gene>" keys per comparison cohort, using only
# outliers with pathway support, for the Venn diagrams.
raw_pathway_support_outliers_for_venn <- outliers %>%
  filter(pathway_support) %>%
  mutate(sample_gene = paste(Sample_ID, gene, sep = "_")) %>%
  arrange(comparison_cohort) %>%
  select(sample_gene, comparison_cohort) %>%
  group_split(comparison_cohort)

list_of_pathway_support_outliers_for_venn <- lapply(
  raw_pathway_support_outliers_for_venn,
  function(x) pull(x, sample_gene)
)

# Name each set from the cohort label carried inside its own split, rather
# than from a separately filtered and sorted copy of the data, so the names
# cannot fall out of step with the group order chosen by group_split().
names(list_of_pathway_support_outliers_for_venn) <- vapply(
  raw_pathway_support_outliers_for_venn,
  function(x) as.character(x$comparison_cohort[[1]]),
  character(1)
)

# Venn diagram of the per-cohort pathway-supported outlier sets, with
# show_intersect enabled.
ggVennDiagram(
  x = list_of_pathway_support_outliers_for_venn,
  show_intersect = TRUE
)
## Warning in geom_text(aes_string(label = "count", text = "text"), x =
## label_coord[, : Ignoring unknown aesthetics: text
# Same Venn diagram, filled with the "Reds" distiller palette.
ggVennDiagram(x = list_of_pathway_support_outliers_for_venn) +
  scale_fill_distiller(
    palette = "Reds",
    direction = 1
  )

Annotate with combined full cohort names

# Gene-by-sample table for pathway-supported outliers: each cell lists,
# comma-separated, the comparison cohorts that called the outlier.
# `high_level_cohort` must be dropped before pivoting. If it is kept it acts
# as an extra id column (one row per gene and high-level cohort, so cohorts
# are never combined), and the later pivot_longer(-gene) treats it as if it
# were a sample, adding spurious rows such as "Treehouse" to the tabulation.
outliers_with_pathway_support_combined_wide <- outliers %>%
  filter(pathway_support) %>%
  select(-pathway_support, -donor_ID, -high_level_cohort) %>%
  pivot_wider(
    names_from = Sample_ID,
    values_from = comparison_cohort,
    values_fn = collapse_fun
  )

# Long form: every column except `gene` is stacked into Sample_ID /
# comparison_cohorts pairs, and rows with NA values are removed.
outliers_with_pathway_support_combined_long <- outliers_with_pathway_support_combined_wide %>%
  pivot_longer(
    cols = -gene,
    names_to = "Sample_ID",
    values_to = "comparison_cohorts"
  ) %>%
  na.omit()

How many outliers with pathway support are present in each combination of cohorts?

# Frequency of each cohort combination among pathway-supported outliers,
# most common first, with formatted percentages and a totals row.
outliers_with_pathway_support_combined_long %>%
  tabyl(comparison_cohorts) %>%
  arrange(desc(n)) %>%
  adorn_pct_formatting() %>%
  adorn_totals() %>%
  kbl() %>%
  kable_styling(full_width = FALSE)
comparison_cohorts n percent
TCGA 112 37.1%
TH03_TH34 67 22.2%
Treehouse 37 12.3%
Treehouse_pc 35 11.6%
PEDAYA 22 7.3%
Treehouse_pd 17 5.6%
Treehouse_pc, Treehouse_pd 12 4.0%
Total 302
# Sample-by-gene tiles coloured by the value of `comparison_cohorts`.
ggplot(data = outliers_with_pathway_support_combined_long) +
  geom_tile(
    mapping = aes(x = Sample_ID, y = gene, fill = comparison_cohorts)
  ) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )

# Number of distinct Sample_ID values in the pathway-support long table.
outliers_with_pathway_support_combined_long %>% pull(Sample_ID) %>% n_distinct()
## [1] 33

Annotate with combined cohort abbreviations

# Wide gene-by-sample table of comma-separated one-letter cohort codes,
# restricted to outliers with pathway support.
# `high_level_cohort` is not dropped here, so it remains an id column and each
# gene gets one row per high-level cohort.
# NOTE(review): confirm that per-high-level-cohort rows are intended; drop
# `high_level_cohort` before pivoting if one row per gene is wanted.
outliers_with_pathway_support_abbrev_combined_wide <- outliers %>%
  filter(pathway_support) %>%
  left_join(cohort_codes, by = c("comparison_cohort" = "cohort_name")) %>%
  select(-c(pathway_support, donor_ID, comparison_cohort)) %>%
  pivot_wider(
    names_from = Sample_ID,
    values_from = cohort_code,
    values_fn = collapse_fun,
    values_fill = ""
  )

Big table of outliers with pathway support

# Render the pathway-support abbreviation table sorted by gene, with
# underscores in the column names shown as spaces.
outliers_with_pathway_support_abbrev_combined_wide %>%
  arrange(gene) %>%
  rename_all(underscore_to_space) %>%
  kbl() %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = "bordered"
  )
gene high level cohort TH34 1149 S02 TH34 1238 S01 TH34 1399 S01 TH34 1400 S01 TH34 1412 S01 TH34 1415 S01 TH34 1444 S01 TH34 1452 S01 TH34 2292 S01 TH34 2411 S01 TH34 1179 S01 TH34 1239 S01 TH34 1349 S01 TH34 1349 S02 TH34 1350 S01 TH34 1352 S01 TH34 1379 S01 TH34 1380 S01 TH34 1381 S01 TH34 1150 S02 TH34 1414 S01 TH34 1445 S02 TH34 1447 S01 TH34 1447 S02 TH34 1455 S01 TH34 1456 S02 TH34 2293 S01 TH34 2351 S01 TH34 2410 S01 TH34 1446 S01 TH34 1162 S01 TH34 1351 S01
AKT1 TH03_TH34 S S
AKT1 Treehouse D
AKT2 PEDAYA P
AKT2 TCGA T
AKT2 TH03_TH34 S
AKT2 Treehouse C, D
BCL6 TCGA T
BTK TCGA T T T
BTK TH03_TH34 S
BTK Treehouse C C C
CCND1 TCGA T
CCND1 TH03_TH34 S
CCND2 Treehouse D
CCND3 TCGA T T
CCNE1 Treehouse D
CDK4 PEDAYA P
CDK4 TCGA T T
CDK4 TH03_TH34 S S
CDK4 Treehouse C, D C
CDK9 TCGA T T
CSF1R Treehouse D
DEPTOR TH03_TH34 S
ETV1 TCGA T
ETV1 Treehouse C C
FGFR1 TCGA T T T
FGFR1 Treehouse C C
FGFR2 TCGA T
FGFR3 TCGA T
FGFR3 TH03_TH34 S
FGFR4 PEDAYA P
FLT4 PEDAYA P
FLT4 TCGA T
FLT4 TH03_TH34 S
FLT4 Treehouse C
GATA2 TCGA T
GATA2 Treehouse D
HDAC4 TCGA T
HDAC4 TH03_TH34 S
HDAC4 Treehouse D
HDAC7 Treehouse D
HMOX1 PEDAYA P
HMOX1 TCGA T T T T T
HMOX1 Treehouse C C C, D D D D
HSP90B1 TCGA T
HSP90B1 TH03_TH34 S
HSP90B1 Treehouse C, D
IGF1 PEDAYA P P P
IGF1 TCGA T T
IGF1 TH03_TH34 S
IGF1 Treehouse C C
IGF2 TCGA T T T T T T T T T T T T T T
IGF2 Treehouse C C C C C D C D C C
IL6 PEDAYA P
IL6 TCGA T
IL6 TH03_TH34 S
IL6 Treehouse C
KDR TCGA T T
KDR TH03_TH34 S
KDR Treehouse C
KIT PEDAYA P
KIT TCGA T
KIT TH03_TH34 S
KIT Treehouse C, D
MAP2K2 TCGA T
MAP2K2 TH03_TH34 S S
MAP2K2 Treehouse C, D
MAP2K4 Treehouse D
MDM2 PEDAYA P
MDM2 TCGA T
MDM2 TH03_TH34 S
MDM2 Treehouse C, D
MS4A1 TCGA T
MS4A1 TH03_TH34 S
MS4A1 Treehouse C
MTOR TCGA T
MTOR TH03_TH34 S
NOTCH3 TCGA T
NOTCH3 TH03_TH34 S
NOTCH3 Treehouse C, D
NTRK2 TH03_TH34 S S S S S S
NTRK3 TCGA T
NTRK3 TH03_TH34 S
NTRK3 Treehouse C
PARP2 TCGA T T
PARP2 Treehouse C, D
PDGFRA TCGA T
PIK3CD TCGA T T T
PIK3CD TH03_TH34 S S
PIK3CD Treehouse C C
PIK3R1 TH03_TH34 S
PIK3R2 TCGA T
PIK3R2 TH03_TH34 S
PIK3R2 Treehouse D C
PIK3R5 TCGA T T
PIK3R5 Treehouse C C
PTCH1 TCGA T T T
PTCH1 Treehouse C C
RAF1 Treehouse D
RPTOR TCGA T
RPTOR TH03_TH34 S
STAT5A Treehouse D
TSC2 TCGA T T T
TSC2 TH03_TH34 S S
TSC2 Treehouse C, D C
VEGFA TCGA T T T
VEGFA TH03_TH34 S S
VEGFA Treehouse C C D
VEGFC PEDAYA P
VEGFC TCGA T
VEGFC TH03_TH34 S
VEGFC Treehouse C, D
WEE1 TCGA T
WEE1 Treehouse C, D
# Print the R version, platform, and attached/loaded package versions used
# for this run.
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Monterey 12.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggVennDiagram_1.2.2 cowplot_1.1.1       gridExtra_2.3      
##  [4] kableExtra_1.3.4    khroma_1.10.0       janitor_2.1.0      
##  [7] forcats_0.5.2       stringr_1.5.0       dplyr_1.0.10       
## [10] purrr_0.3.5         readr_2.1.3         tidyr_1.2.1        
## [13] tibble_3.2.1        ggplot2_3.4.2       tidyverse_1.3.2    
## 
## loaded via a namespace (and not attached):
##  [1] fs_1.6.3            sf_1.0-9            lubridate_1.9.0    
##  [4] bit64_4.0.5         RColorBrewer_1.1-3  webshot_0.5.4      
##  [7] httr_1.4.4          tools_4.2.1         backports_1.4.1    
## [10] bslib_0.5.0         utf8_1.2.3          R6_2.5.1           
## [13] KernSmooth_2.23-20  lazyeval_0.2.2      DBI_1.1.3          
## [16] colorspace_2.1-0    withr_2.5.0         tidyselect_1.2.0   
## [19] bit_4.0.5           compiler_4.2.1      cli_3.6.1          
## [22] rvest_1.0.3         xml2_1.3.3          plotly_4.10.1      
## [25] labeling_0.4.2      sass_0.4.7          scales_1.2.1       
## [28] classInt_0.4-9      proxy_0.4-27        systemfonts_1.0.4  
## [31] digest_0.6.33       yulab.utils_0.0.6   rmarkdown_2.23     
## [34] svglite_2.1.0       pkgconfig_2.0.3     htmltools_0.5.5    
## [37] dbplyr_2.2.1        fastmap_1.1.1       highr_0.10         
## [40] htmlwidgets_1.6.2   rlang_1.1.1         readxl_1.4.1       
## [43] rstudioapi_0.14     jquerylib_0.1.4     farver_2.1.1       
## [46] generics_0.1.3      jsonlite_1.8.7      crosstalk_1.2.0    
## [49] vroom_1.6.0         googlesheets4_1.0.1 magrittr_2.0.3     
## [52] Rcpp_1.0.11         munsell_0.5.0       fansi_1.0.4        
## [55] lifecycle_1.0.3     stringi_1.7.12      yaml_2.3.7         
## [58] snakecase_0.11.0    grid_4.2.1          parallel_4.2.1     
## [61] crayon_1.5.2        haven_2.5.1         hms_1.1.2          
## [64] knitr_1.43          pillar_1.9.0        reprex_2.0.2       
## [67] glue_1.6.2          evaluate_0.21       data.table_1.14.6  
## [70] modelr_0.1.10       vctrs_0.6.3         tzdb_0.3.0         
## [73] cellranger_1.1.0    gtable_0.3.3        assertthat_0.2.1   
## [76] cachem_1.0.8        xfun_0.39           broom_1.0.1        
## [79] e1071_1.7-13        class_7.3-20        googledrive_2.0.0  
## [82] RVenn_1.1.0         viridisLite_0.4.2   gargle_1.2.1       
## [85] units_0.8-1         timechange_0.1.1    ellipsis_0.3.2